
# coding: utf-8

# In[ ]:


# Install/upgrade tomotopy (topic modeling library).
# The %pip magic is used instead of a shell call: it installs into the
# environment of the running kernel and does not depend on the bash-only
# "&> /dev/null" redirection; --quiet keeps the cell output short.
get_ipython().run_line_magic('pip', 'install --quiet --upgrade tomotopy')


# In[ ]:


# Widen the notebook page container to 85% of the browser window.
# display/HTML are imported from IPython.display, the public location;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))


# In[ ]:


import os
import pandas as pd
import re
import numpy as np
import tomotopy as tp
import collections

import matplotlib.pyplot as plt
import matplotlib.colors as clr
import matplotlib.font_manager as fm

get_ipython().run_line_magic('matplotlib', 'inline')

# Never truncate long text cells when a DataFrame is displayed.
# None means "no limit"; the former -1 sentinel is deprecated since
# pandas 1.0 and rejected by pandas 2.
pd.set_option('display.max_colwidth', None)


# In[ ]:


# Work from the project folder that holds the input files.
os.chdir('c:/pythonwork/')


# In[ ]:


# Load the corpus; the 'All_out' column holds the text that is later split on
# whitespace and fed to the topic model.
DF = pd.read_csv('c:/pythonwork/ptm_twitter.csv', encoding='utf-8')

# Treat empty strings as missing and drop those rows. The column is assigned
# back explicitly: an in-place replace on DF['All_out'] acts on a temporary
# object under pandas copy-on-write and would leave DF unchanged.
DF['All_out'] = DF['All_out'].replace('', np.nan)
DF.dropna(subset=['All_out'], inplace=True)

# Collapse runs of spaces into a single space. regex=True is required:
# since pandas 2.0 str.replace treats the pattern as a literal by default.
DF['All_out'] = DF['All_out'].str.replace(' +', ' ', regex=True)

print(len(DF))


# In[ ]:


def compute_coherence_values(doc, start, limit, step):
    """Train one PT model per candidate topic count and record its scores.

    For every ``k`` in ``range(start, limit, step)`` (``limit`` itself is
    excluded) a ``tp.PTModel`` is trained on the corpus, a one-line summary
    is printed, and the c_v coherence score and the perplexity are appended
    to the module-level lists ``coherence_values`` and ``perplexities``.
    Both lists must exist before this function is called.

    Args:
        doc: Iterable of document strings; each is split on whitespace.
        start: First number of topics to try.
        limit: Exclusive upper bound on the number of topics.
        step: Increment between successive topic counts.

    Returns:
        None. Results are reported through the two module-level lists.
    """
    # Tokenise once up front: the token lists do not depend on k, and
    # materialising them also lets ``doc`` be a one-shot iterable.
    # Documents without any token are skipped.
    tokenised_docs = [
        tokens
        for tokens in (text.strip().split() for text in doc)
        if tokens
    ]

    for num_topics in range(start, limit, step):
        # Fixed seed so that runs are repeatable across candidate counts.
        mdl = tp.PTModel(k=num_topics, seed=7777)
        mdl.burn_in = 100

        for tokens in tokenised_docs:
            mdl.add_doc(tokens)

        mdl.train(iter=500, workers=0)
        coherence = tp.coherence.Coherence(mdl, coherence='c_v').get_score()
        perplexity = mdl.perplexity

        print(
            f'Topic: {num_topics}\tLog-likelihood: {mdl.ll_per_word}'
            f'\tPerplexity: {perplexity}\tCoherence: {coherence}'
        )
        coherence_values.append(coherence)
        perplexities.append(perplexity)


# In[ ]:


# Candidate topic counts are range(start, limit, step): 15, 20, ..., 40.
start, limit, step = 15, 45, 5

# Accumulators that compute_coherence_values() appends to, one entry per
# candidate topic count.
perplexities = []
coherence_values = []

compute_coherence_values(DF['All_out'], start, limit, step)


# In[ ]:


x = range(start, limit, step)

# One row per candidate topic count with the scores recorded for it.
mdl_check = pd.DataFrame(
    list(zip(x, perplexities, coherence_values)),
    columns=['Num Topics', 'Perplexity', 'Coherence'],
)

# Two side-by-side panels, one per metric.
fig, ax = plt.subplots(figsize=(15, 5), nrows=1, ncols=2)

plt.subplots_adjust(left=0.125,
                    bottom=0.1,
                    right=0.9,
                    top=0.9,
                    wspace=0.3,
                    hspace=0.5)

# Plot each metric against the number of topics on its own axes.
# The .plot(...) call is now closed; the missing parenthesis was a SyntaxError.
pd.pivot_table(
    mdl_check, values=["Perplexity", "Coherence"], index="Num Topics"
).plot(kind='line', rot=90, ax=ax, subplots=True)


# In[ ]:


# Final model: 35 topics, with the same seed, burn-in and iteration settings
# as the model-selection runs.
mdl = tp.PTModel(k=35, seed=7777)
mdl.burn_in = 100

# Add every text as a whitespace-tokenised document; texts that yield no
# tokens are left out of the model.
for raw_text in DF['All_out']:
    words = raw_text.strip().split()
    if not words:
        continue
    mdl.add_doc(words)

mdl.train(iter=500, workers=0)


# Table 1. Topic keywords (topic labels were assigned manually)

# In[ ]:


# Print the 20 highest-ranked words of every topic, one topic per line:
# "Topic #<id><TAB>word1, word2, ...".
for topic_id in range(mdl.k):
    top_words = [word for word, _ in mdl.get_topic_words(topic_id, top_n=20)]
    print(f'Topic #{topic_id}', ', '.join(top_words), sep='\t')


# In[ ]:


# top_docs[t] collects the documents whose single highest-ranked topic is t.
top_docs = [[] for _ in range(mdl.k)]

for document in mdl.docs:
    dominant_topic = document.get_topics(top_n=1)[0][0]
    top_docs[dominant_topic].append(document)


# In[ ]:


# Summary arrays extracted from the trained model.
# One row per topic: its distribution over words.
topic_term_dists = np.stack(
    [mdl.get_topic_word_dist(topic_id) for topic_id in range(mdl.k)]
)
# One row per document: its distribution over topics.
doc_topic_dists = np.stack(
    [document.get_topic_dist() for document in mdl.docs]
)
# Number of words in each document.
doc_lengths = np.array([len(document.words) for document in mdl.docs])
vocab = list(mdl.used_vocabs)
term_frequency = mdl.used_vocab_freq


# In[ ]:


# One row per document in the model, one column per topic.
doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])

# Column labels 'Topic 01', 'Topic 02', ... are generated from the model's
# topic count, so they stay in step with the model if k is changed.
topic_labels = [f'Topic {number:02d}' for number in range(1, mdl.k + 1)]
df_topic = pd.DataFrame(doc_topic_dists, columns=topic_labels)


# In[ ]:


# Attach each document's 'yymm' value to its topic distribution.
# df_topic is numbered 0..n-1 in the order documents were added to the model,
# while DF keeps its original row labels (rows were dropped earlier) and texts
# without tokens never entered the model. pd.concat(axis=1) aligns on index
# labels, so select the matching rows and renumber them positionally first.
has_tokens = DF['All_out'].str.split().str.len() > 0
yymm = DF.loc[has_tokens, 'yymm'].reset_index(drop=True)

if len(yymm) != len(df_topic):
    raise ValueError(
        'Row count mismatch between corpus ({}) and topic table ({}).'
        .format(len(yymm), len(df_topic))
    )

df_tm = pd.concat([yymm, df_topic], axis=1)
df_tm.head()


# In[ ]:


# Load the table that the trend figures are built from; the pivot further
# down reads its 'yymm' column and the 'Topic 01' ... 'Topic 35' columns.
# NOTE(review): presumably this is df_tm exported to Excel outside this
# script - confirm.
test = pd.read_excel('c:/pythonwork/result.xlsx')


# In[ ]:


# Topic numbers 1..35 and the matching column labels 'Topic 01'..'Topic 35'.
TP_number = list(range(1, 36))
colname = ['Topic %02d' % number for number in TP_number]

# Manually assigned topic labels, in topic order (entry 0 is Topic 01).
TP_name = [
    'Successful models',
    'Patients and symptoms',
    'Fatality in the US',
    'Waves of infections',
    'Testing, tracing, and treatment',
    'Youth culture and pandemic',
    'Government responses',
    'Youth vaccination and Jeju island',
    'Infections in US Forces Korea and the military',
    'Death tolls in different countries',
    'Economic crisis and recovery in the two Koreas',
    'Use of technology',
    'Outbreak and safety in cafe',
    'Assessments of K-quarantine',
    'Covid deaths in South Korea and US',
    'Protest, oppression, and human rights',
    'Authoritarian and developing countries',
    'Church outbreaks',
    'New spikes in Seoul and schools',
    'Use of drugs',
    'US-South Korea military exercise',
    'Vaccination',
    'Golf and golfing during pandemic',
    'Election, party, and politics',
    'Covid statistics and social distancing',
    'Records in the Asia-Pacific region',
    'Support and regulations',
    'Maryland governor',
    'Economic growth and market in Asia',
    'Different vaccines',
    'Motor companies',
    'AI and fight against covid',
    'Plasma donation from church members',
    'Olympics and K-pop',
    'Infections in East Asia',
]

# Mean weight of every topic per 'yymm' value.
data = pd.pivot_table(test, values=colname, index="yymm", aggfunc="mean")


# Figure 1. The average trend of ‘countries' responses and records’ topic

# In[ ]:


def plot_topic_trends(trend, columns, names, topic_indices, nrows, ncols=2,
                      ylim=(0, 0.15)):
    """Draw one line chart per selected topic on a grid of panels and show it.

    Panels are filled row by row in the order of ``topic_indices``; grid
    cells that receive no topic are hidden. Margins come from
    ``subplots_adjust``; ``tight_layout`` is deliberately not applied.

    Args:
        trend: DataFrame with one row per period (its index supplies the
            x tick labels) and one column per topic.
        columns: Column label of every topic, indexed by 0-based topic id.
        names: Display name of every topic, indexed by 0-based topic id.
        topic_indices: 0-based ids of the topics to draw.
        nrows: Number of panel rows; the figure is 6 inches tall per row.
        ncols: Number of panel columns.
        ylim: y-axis limits applied to every panel.

    Raises:
        ValueError: If there are more topics than panels in the grid.
    """
    topic_indices = list(topic_indices)
    if len(topic_indices) > nrows * ncols:
        raise ValueError(
            f'{len(topic_indices)} topics do not fit in a '
            f'{nrows}x{ncols} grid'
        )

    # squeeze=False keeps a 2-D array of axes even for a single row/column.
    fig, axes = plt.subplots(figsize=(20, 6 * nrows), nrows=nrows,
                             ncols=ncols, sharey='col', squeeze=False)
    plt.subplots_adjust(left=0.125,
                        bottom=0.2,
                        right=0.9,
                        top=0.9,
                        wspace=0.1,
                        hspace=0.3)
    plt.setp(axes, ylim=ylim)

    # Plot against integer positions and pin exactly one tick per period, so
    # every tick label is attached to the period it names regardless of the
    # index dtype.
    positions = list(range(len(trend.index)))
    panels = axes.ravel()

    for panel, i in zip(panels, topic_indices):
        panel.plot(positions, trend[columns[i]], color="black")
        panel.set_xticks(positions)
        panel.set_xticklabels(trend.index, rotation=45, ha='right',
                              fontsize=13)
        # Title sits inside the panel, at its top-right corner.
        panel.set_title(f'Topic {i+1}: {names[i]}', fontsize=16,
                        x=0.98, y=1.0, pad=-30, loc='right')

    # Hide grid cells that received no topic.
    for panel in panels[len(topic_indices):]:
        panel.set_visible(False)

    plt.show()


plot_topic_trends(data, colname, TP_name, [0, 2, 6, 9, 13, 14, 25, 34], nrows=4)


# Figure 2. The average trend of ‘infections, testing, tracing, and treatment, social distancing’ topic

# In[ ]:


plot_topic_trends(data, colname, TP_name, [3, 4, 18, 24], nrows=2)


# Figure 3. The average trend of ‘vaccines and vaccination’ topic

# In[ ]:


plot_topic_trends(data, colname, TP_name, [19, 21, 29], nrows=2)


# Figure 4. The average trend of ‘economic issues’ topic

# In[ ]:


# Two rows are requested (second row hidden) to keep the panel size
# consistent with the other 2x2 figures.
plot_topic_trends(data, colname, TP_name, [10, 28], nrows=2)


# Figure 5. The average trend of ‘US camp and the military’ topic

# In[ ]:


plot_topic_trends(data, colname, TP_name, [8, 20], nrows=2)


# Figure 6. The average trend of ‘broadly human rights related’ topic

# In[ ]:


plot_topic_trends(data, colname, TP_name, [15, 17, 32], nrows=2)


# Figure 7. The average trend of ‘youth and culture’ topic

# In[ ]:


plot_topic_trends(data, colname, TP_name, [5, 7, 12, 22, 26, 33], nrows=3)


# Figure 8. The average trend of ‘technology’ topic

# In[ ]:


plot_topic_trends(data, colname, TP_name, [11, 30, 31], nrows=2)

